This analysis examines the demand for Airbnb listings in Boston and Beijing and compares it with the demand for hotels on Booking.com, to further explore how competitors influence Airbnb.
Data sources: Boston and Beijing Airbnb data from http://insideairbnb.com/get-the-data.html; Boston and Beijing Booking.com data collected with a web crawler.
Inside Airbnb is a non-commercial, independent dataset of global Airbnb demand, which includes first-hand data on Airbnb booking history and user reviews.
library(tidyverse)
library(magrittr)
library(readr)
library(lubridate)
library(ggthemes)
library(leaflet)
# Load the cleaned CSV extracts for Beijing (bj) and Boston (bos).
# Paths are relative to the current working directory.

# Reviews (read below via the `year` and `listing_id` columns).
bj_reviews_cleaned <- read.csv(file = "bj_reviews_cleaned.csv")
bos_reviews_cleaned <- read.csv(file = "bos_reviews_cleaned.csv")

# Listings (ratings, superhost flag, price and coordinates are read below).
bj_list_cleaned <- read.csv(file = "bj_list_cleaned.csv")
bos_list_cleaned <- read.csv(file = "bos_list_cleaned.csv")

# Calendar (date, weekday/weekend flag, month, year and price are read below).
bj_calendar_cleaned <- read.csv(file = "bj_calendar_cleaned.csv")
bos_calendar_cleaned <- read.csv(file = "bos_calendar_cleaned.csv")
# Number of reviews each listing received in each year, largest counts first.
# One row per (year, listing_id) with the count in `n`; the result remains
# grouped by `year` because tally() peels off only the innermost grouping.
bj_reviews_yr <- bj_reviews_cleaned %>%
  group_by(year, listing_id) %>%
  tally() %>%
  arrange(desc(n))

bos_reviews_yr <- bos_reviews_cleaned %>%
  group_by(year, listing_id) %>%
  tally() %>%
  arrange(desc(n))
# bjl %>% select(id, name, number_of_reviews) %>% arrange(desc(number_of_reviews))
# bjl %>% select(id, name, number_of_reviews) %>% arrange(number_of_reviews)
# 23437 listings have reviews for Beijing
# 23437/38814
#bosl %>% select(id, name, number_of_reviews) %>% #arrange(desc(number_of_reviews))
# 3507 listings have reviews for Boston
# 3507/3585
# the rate of review in boston is higher than beijing
# Jittered scatter of per-listing yearly review counts with one overall trend
# line.
#
# reviews_yr: data frame with a `year` column and a count column `n`
#             (bj_reviews_yr / bos_reviews_yr).
# city:       city name appended to the plot title.
# Returns the ggplot object; calling it at top level prints the plot.
plot_reviews_by_year <- function(reviews_yr, city) {
  ggplot(reviews_yr, aes(x = as.factor(year), y = n)) +
    geom_jitter(aes(color = n), alpha = 0.3) +
    # A discrete x axis puts every year in its own group, so a per-group
    # smoother sees a single x value and has nothing to fit. Pool all years
    # into one group and fit an explicit linear trend so a line is drawn.
    geom_smooth(aes(group = 1), method = "lm", formula = y ~ x) +
    theme_bw() +
    # ylim() discards listings with more than 250 reviews in a year before any
    # statistic is computed, hence the "Removed ... rows" warnings.
    ylim(0, 250) +
    labs(
      title = paste("How popular is Airbnb in", city),
      subtitle = "Number of reviews received for a single listing over years",
      color = "# of reviews"
    ) +
    xlab("Year") +
    ylab("The number of reviews")
}

plot_reviews_by_year(bj_reviews_yr, "Beijing")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
plot_reviews_by_year(bos_reviews_yr, "Boston")
# Keep only listings that have both a review rating and a superhost flag;
# the rating/superhost plots further down use these subsets.
bos_list_cleaned_rmna <- filter(
  bos_list_cleaned,
  !is.na(review_scores_rating),
  !is.na(host_is_superhost)
)

bj_list_cleaned_rmna <- filter(
  bj_list_cleaned,
  !is.na(review_scores_rating),
  !is.na(host_is_superhost)
)
# Convert a response-rate column to numeric values.
#
# The column is converted through as.character() first: as.numeric() applied
# directly to a factor returns the internal level codes, not the numbers the
# labels spell out. A "%" sign is stripped, and blank, "NA" and "N/A" entries
# become NA.
# NOTE(review): presumably the CSV stores values such as "95%" -- confirm
# against the cleaned files.
#
# x: factor, character or numeric vector.
# Returns a numeric vector of the same length as `x`.
parse_response_rate <- function(x) {
  txt <- trimws(as.character(x))
  txt[txt %in% c("", "NA", "N/A")] <- NA_character_
  as.numeric(sub("%", "", txt, fixed = TRUE))
}

bos_list_cleaned_rmna$host_response_rate <-
  parse_response_rate(bos_list_cleaned_rmna$host_response_rate)
bj_list_cleaned_rmna$host_response_rate <-
  parse_response_rate(bj_list_cleaned_rmna$host_response_rate)

# NOTE(review): the two outputs recorded below predate this conversion. Their
# ranges (1-58 and 1-36, median 2) look like factor level codes rather than
# percentages; regenerate them after re-running.
summary(bj_list_cleaned_rmna$host_response_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 2.00 15.48 38.00 58.00
summary(bos_list_cleaned_rmna$host_response_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 2.00 14.53 34.00 36.00
# Review-rating distribution split by the superhost flag, one plot per city.
bos_list_cleaned_rmna %>%
  ggplot(aes(x = as.factor(host_is_superhost), y = review_scores_rating)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "Review Rating Scores by SuperHost in Boston",
    x = "Super Host or Not",
    y = "Review Rating in 100 pts Scale"
  )

bj_list_cleaned_rmna %>%
  ggplot(aes(x = as.factor(host_is_superhost), y = review_scores_rating)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "Review Rating Scores by SuperHost in Beijing",
    x = "Super Host or Not",
    y = "Review Rating in 100 pts Scale"
  )
# ggplot(bj_list_cleaned_rmna, aes(x=as.factor(host_is_superhost), y=host_response_rate)) + geom_boxplot() +
# theme_bw()
#
# ggplot(bos_list_cleaned_rmna, aes(x=as.factor(host_is_superhost), y=host_response_rate)) + geom_boxplot() +
# theme_bw()
# Review rating against host response rate, coloured by the superhost flag,
# one plot per city. Points are jittered and semi-transparent to reduce
# overplotting.
bos_list_cleaned_rmna %>%
  ggplot(aes(x = host_response_rate, y = review_scores_rating)) +
  geom_jitter(aes(color = as.factor(host_is_superhost)), alpha = 0.3) +
  theme_bw() +
  labs(
    title = "Indicators for SuperHost in Boston",
    subtitle = "Avg. Rating by Response Rate",
    color = "Is SuperHost",
    x = "Host Response Rate",
    y = "Review Rating in 100 pts Scale"
  )

bj_list_cleaned_rmna %>%
  ggplot(aes(x = host_response_rate, y = review_scores_rating)) +
  geom_jitter(aes(color = as.factor(host_is_superhost)), alpha = 0.3) +
  theme_bw() +
  labs(
    title = "Indicators for SuperHost in Beijing",
    subtitle = "Avg. Rating by Response Rate",
    color = "Is SuperHost",
    x = "Host Response Rate",
    y = "Review Rating in 100 pts Scale"
  )
# Which years does the Boston calendar cover? (Recorded output: 2019-2020.)
summary(bos_calendar_cleaned[["year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2019 2020 2020 2020 2020 2020
# Average listed price per calendar date, keeping the `wkd` flag alongside
# each date. Missing prices are ignored (na.rm = TRUE; spelled out because
# `T` is an ordinary variable that can be reassigned).
# ungroup() removes the leftover `date` grouping so that later verbs on these
# tables do not run per date by accident.
avg_price_bj <- bj_calendar_cleaned %>%
  group_by(date, wkd) %>%
  summarize(avgprice = mean(price, na.rm = TRUE)) %>%
  ungroup()

avg_price_bos <- bos_calendar_cleaned %>%
  group_by(date, wkd) %>%
  summarize(avgprice = mean(price, na.rm = TRUE)) %>%
  ungroup()
# Daily average price in Boston, split by the `wkd` flag.
ggplot(avg_price_bos, aes(x = wkd, y = avgprice)) +
  # The jitter layer already draws every day, so hide the boxplot's own
  # outlier markers; otherwise each outlier appears twice (once in place,
  # once jittered). The Beijing plot does the same.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.2) +
  theme_bw() +
  labs(
    title = "Price Trends over the weekday & weekends in Boston",
    subtitle = "avgprice = Avg. price by day"
  ) +
  xlab("weekday & weekends")
# Daily average price in Beijing, split by the `wkd` flag.
# Boxplot outlier markers are hidden because the jitter layer draws every day.
# coord_cartesian() zooms the y axis to 375-420 without dropping any data.
avg_price_bj %>%
  ggplot(aes(x = wkd, y = avgprice)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.2) +
  coord_cartesian(ylim = c(375, 420)) +
  theme_bw() +
  labs(
    title = "Price Trends over the weekday & weekends in Beijing",
    subtitle = "avgprice = Avg. price by day",
    x = "weekday & weekends"
  )
# Average 2020 price per listing and month. Missing prices are ignored
# (na.rm = TRUE; spelled out because `T` is an ordinary variable that can be
# reassigned). A listing-month with no price at all yields NaN.
# ungroup() removes the leftover `listing_id` grouping from the result.
avg_price_bj_2020 <- bj_calendar_cleaned %>%
  filter(year == 2020) %>%
  group_by(listing_id, month) %>%
  summarize(avgprice = mean(price, na.rm = TRUE)) %>%
  ungroup()

avg_price_bos_2020 <- bos_calendar_cleaned %>%
  filter(year == 2020) %>%
  group_by(listing_id, month) %>%
  summarize(avgprice = mean(price, na.rm = TRUE)) %>%
  ungroup()
# Whisker ends (elements 1 and 5 of the boxplot statistics) of the Boston
# per-listing monthly averages, used to size the visible y range.
ylim_bos <- boxplot.stats(avg_price_bos_2020[["avgprice"]])[["stats"]][c(1, 5)]

# Monthly distribution of per-listing average prices in Boston. Outlier
# markers are hidden and the view is zoomed to 1.5x the whisker ends;
# coord_cartesian() zooms without removing data.
avg_price_bos_2020 %>%
  ggplot(aes(x = factor(month), y = avgprice)) +
  geom_boxplot(outlier.shape = NA) +
  theme_bw() +
  coord_cartesian(ylim = ylim_bos * 1.5) +
  labs(
    title = "Price Trends over the month in Boston",
    subtitle = "avgprice = Avg. price by listing",
    x = "Months",
    y = "average price"
  )
## Warning: Removed 681 rows containing non-finite values (stat_boxplot).
# Whisker ends (elements 1 and 5 of the boxplot statistics) of the Beijing
# per-listing monthly averages.
ylim_bj <- boxplot.stats(avg_price_bj_2020$avgprice)$stats[c(1, 5)]

# Monthly distribution of per-listing average prices in Beijing. Outlier
# markers are hidden; coord_cartesian() zooms without removing data.
ggplot(avg_price_bj_2020, aes(x = factor(month), y = avgprice)) +
  geom_boxplot(outlier.shape = NA) +
  # Size the window from Beijing's own whiskers (ylim_bj), not Boston's.
  # NOTE(review): the 1.5 factor mirrors the Boston plot; the earlier 2.4
  # factor was applied to Boston's limits. Adjust if the window is too tight.
  coord_cartesian(ylim = ylim_bj * 1.5) +
  theme_bw() +
  labs(
    title = "Price Trends over the month in Beijing",
    subtitle = "avgprice = Avg. price by listing"
  ) +
  xlab("Months") +
  ylab("average price")
## Warning: Removed 51194 rows containing non-finite values (stat_boxplot).
# Boston listings with log1p(price) and the fifth power of the review rating;
# the raw price and rating columns are dropped afterwards.
bosAirbnb <- bos_list_cleaned %>%
  mutate(
    Log1pPrice = log1p(price),
    transformed_review = review_scores_rating^5
  ) %>%
  select(-price, -review_scores_rating)

# Colour scale spanning the log-price range of all Boston listings.
pal <- colorNumeric(palette = rainbow(6), domain = bosAirbnb$Log1pPrice)

# Interactive map: one marker per listing that has a price, coloured by log
# price.
bosAirbnb %>%
  filter(!is.na(Log1pPrice)) %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~longitude, lat = ~latitude,
    color = ~pal(Log1pPrice), weight = 1, radius = 1.5,
    fillOpacity = 1, opacity = 1,
    # The label is a formula so it is evaluated on the same filtered rows as
    # the markers; a vector taken from the unfiltered table would shift labels
    # onto the wrong listings once any row is dropped.
    label = ~paste("Neighbourhood:", neighbourhood_cleansed)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal, values = ~Log1pPrice,
    title = "Log1pPrice",
    opacity = 1
  )
# Beijing listings with log1p(price) and the fifth power of the review rating;
# the raw price and rating columns are dropped afterwards.
BJAirbnb <- bj_list_cleaned %>%
  mutate(
    Log1pPrice = log1p(price),
    transformed_review = review_scores_rating^5
  ) %>%
  select(-price, -review_scores_rating)

# Colour scale spanning the log-price range of all Beijing listings.
pal <- colorNumeric(palette = rainbow(6), domain = BJAirbnb$Log1pPrice)

# Interactive map: one marker per listing that has a price, coloured by log
# price.
BJAirbnb %>%
  filter(!is.na(Log1pPrice)) %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~longitude, lat = ~latitude,
    color = ~pal(Log1pPrice), weight = 1, radius = 1.5,
    fillOpacity = 1, opacity = 1,
    # The label is a formula so it is evaluated on the same filtered rows as
    # the markers; a vector taken from the unfiltered table would shift labels
    # onto the wrong listings once any row is dropped.
    label = ~paste("Neighbourhood:", neighbourhood_cleansed)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal, values = ~Log1pPrice,
    title = "Log1pPrice",
    opacity = 1
  )